Loading libraries
library(dplyr)
library(ggplot2)
library(plotly)
library(tidyr)
Reading data
# Load the first 10000 rows of the summary file.
# read.csv2() defaults to dec = ",", but the fractional values in this file
# are written with "." (e.g. 0.8571428571). Without overriding dec, columns
# such as blob_volume_coverage and skeleton_density are not parsed as numbers
# and end up as factor/character columns.
# NOTE(review): the default na.strings = "NA" turns a literal "NA" res_name
# into a missing value, which drop_na() later removes — confirm that is intended.
data <- read.csv2("./all_summary.csv", dec = ".", nrows = 10000)
dim(data)
## [1] 10000 412
Processing missing data
# Names of the columns that are kept explicitly by the selection step below.
required_columns <- c(
  "res_name",
  "blob_volume_coverage",
  "blob_volume_coverage_second",
  "skeleton_density",
  "local_res_atom_non_h_count",
  "local_res_atom_non_h_electron_sum"
)
dim(data)
## [1] 10000 412
# Keep the required columns plus every column whose name contains "part_01",
# then discard rows that have a missing value in any of the kept columns.
data <- select(data, one_of(required_columns), contains("part_01"))
data <- drop_na(data)
dim(data)
## [1] 9756 112
Deleting chosen ligands
# res_name values whose rows are removed from the analysis.
# NOTE(review): "H20" is spelled with a zero; it may be meant as "H2O" —
# confirm against the data before changing it.
deletable_res_name <- c(
  "UNK", "UNX", "UNL", "DUM", "N", "BLOB",
  "ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", "ILE",
  "LEU", "LYS", "MET", "MSE", "PHE", "PRO", "SEC", "SER", "THR", "TRP",
  "TYR", "VAL",
  "DA", "DG", "DT", "DC", "DU",
  "A", "G", "T", "C", "U",
  "HOH", "H20", "WAT"
)
# Remove every row whose res_name is on the exclusion list.
data <- filter(data, !(res_name %in% deletable_res_name))
dim(data)
## [1] 9696 112
Data summary
# Render a per-column summary of res_name and three other selected columns
# as a table.
statistics <- select(
  data,
  res_name,
  blob_volume_coverage,
  blob_volume_coverage_second,
  skeleton_density
)
knitr::kable(summary(statistics))
|   | res_name | blob_volume_coverage | blob_volume_coverage_second | skeleton_density |
|:--|:---------|:---------------------|:----------------------------|:-----------------|
|   | SO4 :1005 | 1 : 133 | 0 :8186 | 0 :1094 |
|   | GOL : 629 | 0.8571428571: 6 | 0.0243902439 : 2 | 1 : 952 |
|   | EDO : 512 | 0.8333333333: 5 | 0.02523659306: 2 | 0.6666666667: 515 |
|   | NAG : 452 | 0.8461538462: 5 | 0.0200661832 : 1 | 0.5 : 283 |
|   | CL : 387 | 0.3266490765: 4 | 0.02009536785: 1 | 0.1666666667: 227 |
|   | DMS : 340 | 0.75 : 4 | 0.02016883762: 1 | 0.1538461538: 222 |
|   | (Other):6371 | (Other) :9539 | (Other) :1503 | (Other) :6403 |
# The summary step assigns to `statistics`, not `data`, so the dimensions
# reported here match the previous check.
dim(data)
## [1] 9696 112
50 most popular ligands
# Count rows per ligand name, sort by descending count and keep the first 50.
popular_ligands <- data %>%
  count(res_name, sort = TRUE) %>%
  slice(seq_len(50))
# Names of those ligands, in the same order as the ranking.
popular_names_vector <- popular_ligands$res_name
# Restrict the data set to rows that belong to one of the retained ligands.
data <- filter(data, res_name %in% popular_names_vector)
dim(data)
## [1] 6607 112
Cardinality of ligands by name
# Bar chart of the per-ligand counts, bars ordered by descending count and
# coloured by count; x-axis labels are rotated 90 degrees.
plot_ligands <- ggplot(
  popular_ligands,
  aes(x = reorder(res_name, -n), y = n, fill = n)
) +
  geom_bar(stat = "identity") +
  labs(title = "Cardinality of ligands by name", x = "ligand") +
  theme(axis.text.x = element_text(angle = 90))
# Convert the static ggplot into an interactive plotly figure.
ggplotly(plot_ligands)
Distribution of atom and electron count
# Density estimate of local_res_atom_non_h_count, drawn as a filled area
# without an outline.
plot_atom <- ggplot(data, aes(x = local_res_atom_non_h_count)) +
  geom_density(fill = "#00CECB", colour = NA, alpha = 0.3) +
  labs(x = "atom count", title = "Atom count distribution")
ggplotly(plot_atom)
# Density estimate of local_res_atom_non_h_electron_sum, drawn as a filled
# area without an outline.
plot_electron <- ggplot(data, aes(x = local_res_atom_non_h_electron_sum)) +
  geom_density(fill = "#FF5E5B", colour = NA, alpha = 0.3) +
  labs(x = "electron count", title = "Electron count distribution")
ggplotly(plot_electron)
Distribution of part_01 columns
# Reshape the part_01 columns into long form: one row per (part, value) pair,
# where `part` holds the original column name.
# everything() gathers all selected columns instead of the hard-coded 1:106
# range, so this step keeps working if the number of part_01 columns changes.
plot_part_data <- data %>%
  select(contains("part_01")) %>%
  gather(part, value, everything())
## Warning: attributes are not identical across measure variables;
## they will be dropped
dim(plot_part_data)
## [1] 700342 2
# Disabled: boxplot of `value` per `part`, faceted into one panel per `part`.
# NOTE(review): presumably commented out because of the rendering cost on the
# long table — confirm before re-enabling.
# plot_part <- ggplot(plot_part_data, aes(x = part, y = value)) +
# geom_boxplot() +
# facet_wrap(~part)
# ggplotly(plot_part)